In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import hvplot.pandas
from scipy import stats
import sklearn 
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
In [2]:
sales=pd.read_csv("C:\\Users\\MAHFOOZ ALAM\\Desktop\\DATASET FOR USAGE\\Sales Prediction.csv")
In [3]:
sales.head()
Out[3]:
TV Radio Newspaper Sales
0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 12.0
3 151.5 41.3 58.5 16.5
4 180.8 10.8 58.4 17.9
In [4]:
sales.columns
Out[4]:
Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')
In [5]:
sales.shape
Out[5]:
(200, 4)
In [6]:
sales.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB
In [7]:
sales.describe()
Out[7]:
TV Radio Newspaper Sales
count 200.000000 200.000000 200.000000 200.000000
mean 147.042500 23.264000 30.554000 15.130500
std 85.854236 14.846809 21.778621 5.283892
min 0.700000 0.000000 0.300000 1.600000
25% 74.375000 9.975000 12.750000 11.000000
50% 149.750000 22.900000 25.750000 16.000000
75% 218.825000 36.525000 45.100000 19.050000
max 296.400000 49.600000 114.000000 27.000000
In [8]:
sales.Sales.value_counts()
Out[8]:
11.9    5
16.7    5
20.7    4
11.0    3
11.3    3
       ..
13.4    1
24.2    1
8.1     1
5.5     1
25.5    1
Name: Sales, Length: 121, dtype: int64
In [9]:
sns.countplot(sales['Sales'])
Out[9]:
<Axes: ylabel='count'>
In [10]:
sales.isna().sum()
Out[10]:
TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64
In [11]:
data_categorical=sales.loc[:,sales.dtypes== object]
data_categorical
Out[11]:
0
1
2
3
4
...
195
196
197
198
199

200 rows × 0 columns

In [12]:
data_numerical = sales.select_dtypes(exclude=[object])
data_numerical
Out[12]:
TV Radio Newspaper Sales
0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 12.0
3 151.5 41.3 58.5 16.5
4 180.8 10.8 58.4 17.9
... ... ... ... ...
195 38.2 3.7 13.8 7.6
196 94.2 4.9 8.1 14.0
197 177.0 9.3 6.4 14.8
198 283.6 42.0 66.2 25.5
199 232.1 8.6 8.7 18.4

200 rows × 4 columns

In [13]:
df_dup=sales.duplicated().any()
df_dup
Out[13]:
False
In [14]:
fig, axs = plt.subplots(3, figsize = (5,5))
plt1 = sns.boxplot(sales['TV'], ax = axs[0])
plt2 = sns.boxplot(sales['Newspaper'], ax = axs[1])
plt3 = sns.boxplot(sales['Radio'], ax = axs[2])
plt.tight_layout()
In [15]:
sns.boxplot(sales['Sales'])
plt.show()
In [16]:
sns.pairplot(sales, x_vars=['TV', 'Newspaper', 'Radio'], y_vars='Sales', height=4, aspect=1, kind='scatter')
Out[16]:
<seaborn.axisgrid.PairGrid at 0x16936f87370>
In [17]:
plt.figure(figsize=(10,7))
sns.heatmap(sales.corr(), annot = True)
Out[17]:
<Axes: >
In [18]:
X = sales['TV']
y = sales['Sales']
In [19]:
import statsmodels.api as sm 
In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
In [21]:
lr=LinearRegression()
In [22]:
X_train_sm = sm.add_constant(X_train)

lr = sm.OLS(y_train, X_train_sm).fit()
In [23]:
X_test_sm = sm.add_constant(X_test)

y_pred = lr.predict(X_test_sm)
In [24]:
lr.params
Out[24]:
const    7.206555
TV       0.054835
dtype: float64
In [25]:
print(lr.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  Sales   R-squared:                       0.800
Model:                            OLS   Adj. R-squared:                  0.798
Method:                 Least Squares   F-statistic:                     550.7
Date:                Sat, 26 Aug 2023   Prob (F-statistic):           5.08e-50
Time:                        18:37:15   Log-Likelihood:                -314.94
No. Observations:                 140   AIC:                             633.9
Df Residuals:                     138   BIC:                             639.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2066      0.414     17.392      0.000       6.387       8.026
TV             0.0548      0.002     23.467      0.000       0.050       0.059
==============================================================================
Omnibus:                        1.138   Durbin-Watson:                   2.351
Prob(Omnibus):                  0.566   Jarque-Bera (JB):                1.240
Skew:                          -0.190   Prob(JB):                        0.538
Kurtosis:                       2.739   Cond. No.                         376.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [26]:
plt.scatter(X_test, y_test)
plt.plot(X_test, 7.2066 + 0.0543*X_test, 'r')
plt.show()
In [27]:
import sklearn.metrics as metrics
print('MAE: {}'.format(metrics.mean_absolute_error(y_test, y_pred)))
print('MSE: {}'.format(metrics.mean_squared_error(y_test, y_pred)))
print('RMSE: {}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))
MAE: 1.806912377664152
MSE: 5.1795254021666555
RMSE: 2.2758570698017606
In [28]:
sns.distplot((y_test-y_pred))
C:\Users\MAHFOOZ ALAM\AppData\Local\Temp\ipykernel_12304\2077165099.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot((y_test-y_pred))
Out[28]:
<Axes: ylabel='Density'>
In [29]:
res = (y_test - y_pred)
In [30]:
 plt.scatter(X_test,res)
plt.show()
In [31]:
X_test_sm = sm.add_constant(X_test)

y_pred = lr.predict(X_test_sm)
In [32]:
y_pred.head()
Out[32]:
95     16.161091
15     17.921291
30     23.267692
158     7.848123
128    19.286679
dtype: float64
In [33]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
In [34]:
np.sqrt(mean_squared_error(y_test, y_pred))
Out[34]:
2.2758570698017606
In [35]:
r_squared = r2_score(y_test, y_pred)
r_squared
Out[35]:
0.814855389208679
In [36]:
plt.scatter(X_test, y_test)
plt.plot(X_test, 6.948 + 0.054 * X_test, 'r')
plt.show()
In [ ]: